# Import Libraries
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
sns.set(color_codes=True)
from sklearn import preprocessing
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from scipy.cluster.hierarchy import dendrogram, linkage, cophenet, fcluster
from scipy.spatial.distance import cdist, pdist
import plotly.express as px
# Load the dataset and run first-pass EDA.
cardData = pd.read_excel('Credit Card Customer Data.xlsx')
cardData.head()
# Shape of data
cardData.shape
# Total number of records; reused later for outlier percentages.
rowCount = cardData.shape[0]
rowCount
# Data Types
cardData.dtypes
# Null check
cardData.isnull().sum()
# number of unique values
cardData.nunique()
# Unique values & counts i.e. value counts
for col in cardData.columns:
    print('----------- Value Counts of ', col, ' -----------')
    # value_counts() on the Series directly; the original
    # groupby(by=col)[col].value_counts() produced the same counts under a
    # redundant double index.
    print(cardData[col].value_counts())
    print('-------------------------------------')
    print()
# Statistical summary of numeric columns
cardData.describe().T
def getQuartileCounts(col, data=None):
    """Print a quartile / outlier analysis of a numeric column and return the stats.

    Parameters:
        col: name of the numeric column to analyse.
        data: optional DataFrame to analyse; defaults to the module-level
            `cardData`, so existing calls keep working unchanged.

    Returns:
        dict with the computed statistics (min/max, std, mean, median,
        quartiles, IQR, Tukey fences, per-quartile record counts and outlier
        counts). The original version returned None; callers that ignore the
        return value are unaffected.
    """
    if data is None:
        data = cardData  # fall back to the dataset loaded at the top of the script
    totalRows = data.shape[0]
    print('Quartile Analysis of ', col)
    print()
    series = data[col]
    Min = series.min()
    Max = series.max()
    Std = series.std()
    Mean = series.mean()
    Median = series.median()
    Q1 = series.quantile(q=0.25)
    Q2 = series.quantile(q=0.50)
    Q3 = series.quantile(q=0.75)
    IQR = Q3 - Q1
    # Tukey fences: values outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR] count as outliers.
    Minimum = Q1 - (1.5 * IQR)
    Maximum = Q3 + (1.5 * IQR)
    LeftOutlierCount = data[series < Minimum].shape[0]
    Q1Count = data[(series >= Minimum) & (series <= Q1)].shape[0]
    Q2Count = data[(series > Q1) & (series <= Q2)].shape[0]
    Q3Count = data[(series > Q2) & (series <= Q3)].shape[0]
    Q4Count = data[(series > Q3) & (series <= Maximum)].shape[0]
    RightOutlierCount = data[series > Maximum].shape[0]
    print('Min ', Min, " Value count: ", data[series == Min].shape[0])
    print('Max ', Max, " Value count: ", data[series == Max].shape[0])
    print()
    print('Standard Deviation ', Std)
    print('Mean ', Mean)
    print('Median ', Median)
    print()
    print('25th percentile Q1 ', Q1)
    print('50th percentile Q2 ', Q2)
    print('75th percentile Q3 ', Q3)
    print('IQR ', IQR)
    print('Minimum = Q1 - 1.5*IQR = ', Minimum)
    print('Maximum = Q3 + 1.5*IQR = ', Maximum)
    print()
    print('Left outlier count i.e. < Minimum ', LeftOutlierCount)
    print('>= Minimum and <= Q1 count ', Q1Count)
    print('> Q1 and <= Q2 count ', Q2Count)
    print('> Q2 and <= Q3 count ', Q3Count)
    print('> Q3 and <= Maximum ', Q4Count)
    print('Right outlier count i.e. > Maximum ', RightOutlierCount)
    print('Total Outliers ', (LeftOutlierCount+RightOutlierCount), ' ', (LeftOutlierCount+RightOutlierCount)*100/totalRows, '%', ' of total records')
    print()
    # Simple skewness heuristic based on the relation of mean to median.
    if Mean > Median:
        print('Distribution is Right Skewed because Mean > Median')
    elif Mean < Median:
        print('Distribution is Left Skewed because Mean < Median')
    else:
        print('Distribution is Symmetric because Mean = Median')
    return {
        'min': Min, 'max': Max, 'std': Std, 'mean': Mean, 'median': Median,
        'q1': Q1, 'q2': Q2, 'q3': Q3, 'iqr': IQR,
        'lower_fence': Minimum, 'upper_fence': Maximum,
        'left_outliers': LeftOutlierCount, 'q1_count': Q1Count,
        'q2_count': Q2Count, 'q3_count': Q3Count, 'q4_count': Q4Count,
        'right_outliers': RightOutlierCount,
    }
def plotUnivariate(col, pltType):
    """Plot a univariate summary of column `col` from the module-level `cardData`.

    Parameters:
        col: column name to plot.
        pltType: plot layout; only 'box-dist' (box plot + distribution plot,
            side by side) is supported — any other value is a no-op.

    Also prints the quartile/outlier analysis via getQuartileCounts.
    """
    if pltType == 'box-dist':
        fig, axes = plt.subplots(1, 2, figsize=[15, 5])
        fig.tight_layout(pad=5.0)
        getQuartileCounts(col)
        sns.boxplot(data=cardData, x=col, ax=axes[0])
        # sns.distplot was deprecated in seaborn 0.11 and removed in 0.14;
        # histplot(..., kde=True) is the modern equivalent.
        sns.histplot(cardData[col], kde=True, ax=axes[1])
# Univariate analysis: for each numeric column, print the quartile breakdown
# and draw a box plot + distribution plot.
# Plot for Avg_Credit_Limit
plotUnivariate('Avg_Credit_Limit', 'box-dist')
# Plot for Total_Credit_Cards
plotUnivariate('Total_Credit_Cards', 'box-dist')
# Plot for Total_visits_bank
plotUnivariate('Total_visits_bank', 'box-dist')
# Plot for Total_visits_online
plotUnivariate('Total_visits_online', 'box-dist')
# Plot for Total_calls_made
plotUnivariate('Total_calls_made', 'box-dist')
# Let's drop the Serial Number and Customer Key columns (because these are Identifier columns)
# and create new dataset for analysis.
cardDataTrimmed = cardData.drop(columns=['Sl_No', 'Customer Key'])
# Correlation between the columns
cardDataTrimmed.corr()
# Pair plot
sns.pairplot(cardDataTrimmed, diag_kind='kde');
# Let's create a new feature based on different types of contacts
# This feature will help in segmentation and visualization
# Total_Contacts = bank visits + online visits + calls made (all contact channels).
cardDataTotalContactsCopy = cardDataTrimmed.copy()
cardDataTotalContactsCopy["Total_Contacts"] = (cardDataTotalContactsCopy["Total_visits_bank"]
+ cardDataTotalContactsCopy["Total_visits_online"]
+ cardDataTotalContactsCopy["Total_calls_made"])
cardDataTotalContactsCopy.head()
# Scatter 3D Plot
# Interactive plotly 3-D scatter, coloured by the engineered Total_Contacts feature.
px.scatter_3d(cardDataTotalContactsCopy, x='Avg_Credit_Limit', y='Total_Credit_Cards', z='Total_Contacts',
color='Total_Contacts')
# Fixed seed used as random_state for every KMeans run below (reproducibility).
random = 7
# Data Preprocessing - Scaling
# Standardize each feature to zero mean / unit variance before K-means:
# K-means is distance based and Avg_Credit_Limit would otherwise dominate.
scalar = preprocessing.StandardScaler()
cardDataKMeans = pd.DataFrame(scalar.fit_transform(cardDataTrimmed), columns=cardDataTrimmed.columns)
cardDataKMeans.head()
# K-means - Let's find the k using Average Distortions: for each candidate k,
# the distortion is the mean distance from every record to its nearest
# cluster centre; the elbow of this curve suggests a good k.
distortions = []
cardDataDistortionsCopy = cardDataKMeans.copy()
kMeansClusters = range(1, 15)
for k in kMeansClusters:
    model = KMeans(n_clusters=k, random_state=random)
    model.fit(cardDataDistortionsCopy)
    # Distance of each record to every centroid; take the nearest one.
    # (The original also called model.predict() but never used the result,
    # and divided by the global rowCount rather than this frame's own length.)
    distances = cdist(cardDataDistortionsCopy, model.cluster_centers_, 'euclidean')
    distortions.append(np.min(distances, axis=1).mean())
print('Average Distortions')
print()
print(distortions)
fig = plt.figure(figsize=[20,5])
plt.plot(kMeansClusters, distortions, 'bo-')
plt.xlabel('k')
plt.ylabel('Average distortion')
plt.title('Selecting k with the Elbow Method - Average Distortions');
# K-means - evaluate candidate values of k with the silhouette score.
# The silhouette score is only defined for k >= 2, hence the range start.
silhouette = []
cardDataSilhouetteCopy = cardDataKMeans.copy()
kMeansClusters = range(2, 15)
for k in kMeansClusters:
    clusterer = KMeans(n_clusters=k, random_state=random)
    assignedLabels = clusterer.fit_predict(cardDataSilhouetteCopy)
    silhouette.append(silhouette_score(cardDataSilhouetteCopy, assignedLabels))
print('Silhouette Score')
print()
print(silhouette)
# Higher silhouette = better separated clusters; look for the peak.
plt.plot(kMeansClusters, silhouette, 'bo-')
plt.xlabel('k')
plt.ylabel('Silhouette Score')
plt.title('Selecting k with the Elbow Method - Silhouette Score');
# Fit the final K-means model with the k chosen above (k = 5).
cardDataFinalModelCopy = cardDataKMeans.copy()
model = KMeans(n_clusters=5, random_state=random)
labels = model.fit_predict(cardDataFinalModelCopy)
print("Silhouette Score for Final KMeans model:", silhouette_score(cardDataFinalModelCopy, labels))
# Attach the cluster assignment to the scaled frame for visualisation.
cardDataFinalModelCopy['Group'] = labels
cardDataFinalModelCopy.head()
# Scatter 3D Plot of the clustered records, coloured by cluster.
px.scatter_3d(cardDataFinalModelCopy, x='Avg_Credit_Limit', y='Total_Credit_Cards', z='Group', color='Group')
# Box Plot of every feature, split by cluster.
cardDataFinalModelCopy.boxplot(by='Group', figsize=(15,15));
# Data Preprocessing - Scaling
# Re-scale the trimmed data independently for hierarchical clustering.
scalar = preprocessing.StandardScaler()
cardDataHierarchical = pd.DataFrame(scalar.fit_transform(cardDataTrimmed), columns=cardDataTrimmed.columns)
cardDataHierarchical.head()
#linkageMethods=['single','complete','average','ward','median']
# Results table filled in below by plotHierarchicalClustering — one row per linkage method.
hierarchicalResults = pd.DataFrame(columns=['Linkage Method', 'Cophenetic Correlation', 'Dendogram Distance For 5 Clusters',
'Silhouette Score'])
def plotHierarchicalClustering(mtd, dataSet):
    """Run agglomerative clustering on `dataSet` with linkage method `mtd`.

    Plots a truncated dendrogram, searches for the cut distance that yields
    exactly 5 flat clusters, prints diagnostics, draws per-cluster box plots
    and returns a pandas Series shaped like the `hierarchicalResults` columns
    (method, cophenetic correlation, cut distance, silhouette score).

    NOTE: adds a 'Group' column to `dataSet` in place — callers pass a copy.
    """
    linkageMatrix = linkage(dataSet, method=mtd, metric='euclidean')
    # Cophenetic correlation: how faithfully the dendrogram preserves the
    # original pairwise distances (closer to 1 is better).
    cophenetic_correlation, cophenetic_distances = cophenet(linkageMatrix, pdist(dataSet))
    print("Linkage Method:", mtd)
    print("Cophenetic Correlation:", cophenetic_correlation)
    plt.figure(figsize=(25, 10))
    plt.title('Hierarchical Clustering Dendrogram - showing Top 15 clusters for Linkage Method "'+mtd+'"',
              fontdict={'fontsize': 25})
    dendrogram(linkageMatrix, truncate_mode='lastp', p=15)
    # Search for a cut distance that produces exactly 5 flat clusters, starting
    # from distance 2 and stepping by 0.01 (smaller distance => more clusters).
    # The original while-loops hung forever whenever one step jumped past 5
    # (e.g. 4 -> 6 clusters); here the step is halved on such an overshoot and
    # the whole search is iteration-bounded so it always terminates.
    dendogramDistance = 2
    step = 0.01
    clusters = fcluster(linkageMatrix, dendogramDistance, criterion='distance')
    for _ in range(100000):
        nClusters = len(set(clusters))
        if nClusters == 5:
            break
        # Too few clusters -> cut lower; too many -> cut higher.
        direction = -1 if nClusters < 5 else 1
        nextDistance = dendogramDistance + direction * step
        nextClusters = fcluster(linkageMatrix, nextDistance, criterion='distance')
        nextCount = len(set(nextClusters))
        if (nClusters - 5) * (nextCount - 5) < 0:
            # One step would skip straight past 5 clusters: refine the step.
            step /= 2
            continue
        dendogramDistance, clusters = nextDistance, nextClusters
    print("Dendogram Distance where ", len(set(clusters)), " clusters found is:", dendogramDistance)
    silhouetteScore = silhouette_score(dataSet, clusters)
    print("Silhouette Score:", silhouetteScore)
    dataSet['Group'] = clusters
    # Box Plot of every feature, split by cluster.
    dataSet.boxplot(by='Group', figsize=(15,15));
    resultFrame = pd.Series([mtd, cophenetic_correlation, dendogramDistance, silhouetteScore], index=hierarchicalResults.columns)
    return resultFrame
# Evaluate each linkage method on its own copy of the scaled data and collect
# one result row per method. DataFrame.append was deprecated in pandas 1.4 and
# removed in 2.0, so the rows are accumulated with pd.concat instead.
for linkageMethod in ['single', 'complete', 'average', 'ward', 'median']:
    result = plotHierarchicalClustering(linkageMethod, cardDataHierarchical.copy())
    hierarchicalResults = pd.concat([hierarchicalResults, result.to_frame().T], ignore_index=True)
# Rank the linkage methods: higher cophenetic correlation = more faithful tree.
hierarchicalResults.sort_values(by='Cophenetic Correlation', ascending=False)
# Final model comparison: re-fit both finalized models on the scaled data and
# compare silhouette scores side by side.
# KMeans - from model finalized above
model=KMeans(n_clusters=5, random_state=random)
model.fit(cardDataKMeans)
labels=model.predict(cardDataKMeans)
print("Silhouette Score for Final KMeans model:", silhouette_score(cardDataKMeans, labels))
# Hierarchical Clustering - from linkage method, dendogramic distance finalized above
# (average linkage, tree cut at cophenetic distance 2.51 to get the flat clusters).
linkageMatrix = linkage(cardDataHierarchical, method='average', metric='euclidean')
clusters = fcluster(linkageMatrix, 2.51, criterion='distance')
print("Silhouette Score for Final Hierarchical clustering:", silhouette_score(cardDataHierarchical, clusters))
# Assign groups on original trimmed data from final model labels in above steps.
cardDataCopyKMeans = cardDataTrimmed.copy()
cardDataCopyKMeans['Group'] = labels
cardDataCopyHierarchical = cardDataTrimmed.copy()
cardDataCopyHierarchical['Group'] = clusters
# Per-cluster summary statistics (unscaled values) for the K-means grouping.
cardDataCopyKMeans.groupby(by='Group').describe().T
# Per-cluster density (KDE) plots of each feature for the K-means grouping.
cardDataCopyKMeans.groupby(by='Group')['Avg_Credit_Limit'].plot(kind='kde');
cardDataCopyKMeans.groupby(by='Group')['Total_Credit_Cards'].plot(kind='kde');
cardDataCopyKMeans.groupby(by='Group')['Total_visits_bank'].plot(kind='kde');
cardDataCopyKMeans.groupby(by='Group')['Total_visits_online'].plot(kind='kde');
cardDataCopyKMeans.groupby(by='Group')['Total_calls_made'].plot(kind='kde');
# Per-cluster summary statistics (unscaled values) for the hierarchical grouping.
cardDataCopyHierarchical.groupby(by='Group').describe().T
# Remove two records from group 3 and 5 to avoid error during plot as it requires more than one records.
df_For_Graph = cardDataCopyHierarchical[(cardDataCopyHierarchical["Group"] != 3) & (cardDataCopyHierarchical["Group"] != 5)]
df_For_Graph.groupby(by='Group')['Avg_Credit_Limit'].plot(kind='kde');
df_For_Graph.groupby(by='Group')['Total_Credit_Cards'].plot(kind='kde');
df_For_Graph.groupby(by='Group')['Total_visits_bank'].plot(kind='kde');
df_For_Graph.groupby(by='Group')['Total_visits_online'].plot(kind='kde');
df_For_Graph.groupby(by='Group')['Total_calls_made'].plot(kind='kde');